*** Open file "moe_clean.student.leavers"

* Renaming and labeling variables.
* Every source column gets a short working name; the original column name is
* stored as the variable label so the origin of each variable stays visible.
rename moe_sl_sex_snz_code sex
label variable sex "moe_sl_sex_snz_code"
rename moe_sl_leaver_age age
label variable age "moe_sl_leaver_age"
rename moe_sl_eligibility_code elig
label variable elig "moe_sl_eligibility_code"
rename moe_sl_leaving_yr_lvl yearlevel
label variable yearlevel "moe_sl_leaving_yr_lvl"
rename moe_sl_leaving_reason_code reason
label variable reason "moe_sl_leaving_reason_code"
rename moe_sl_time_in_enrol daysenrolled
label variable daysenrolled "moe_sl_time_in_enrol"
rename moe_sl_provider_code schoolno
label variable schoolno "moe_sl_provider_code"
rename moe_sl_leaver_year leaveryear
label variable leaveryear "moe_sl_leaver_year"
rename moe_sl_leaver_count_options_code moecode
label variable moecode "moe_sl_leaver_count_options_code"
rename moe_sl_refugee_status status
label variable status "moe_sl_refugee_status"
rename moe_sl_ue_entrance_code ue
label variable ue "moe_sl_ue_entrance_code"
rename moe_sl_highest_attain_code attainment
label variable attainment "moe_sl_highest_attain_code"
rename moe_sl_first_day_attend firstday
* The label must name this variable's own source column (it previously repeated
* the label of 'attainment').
label variable firstday "moe_sl_first_day_attend"
rename moe_sl_last_day_attend lastday
label variable lastday "moe_sl_last_day_attend"
rename moe_sl_extract_date extractdate
label variable extractdate "moe_sl_extract_date"

notes sex : Male=1; Female=2

* Build a numeric University Entrance (UE) indicator from the string variable 'ue'.
* encode numbers the string categories 1, 2, ...; code 1 becomes 0 and code 2
* becomes 1. Any other code (or missing) is carried over unchanged.
encode ue, generate(ue_code)
drop ue
generate ue = cond(ue_code==1, 0, cond(ue_code==2, 1, ue_code))
drop ue_code
notes ue : Yes=1; No=0

* Replace the string variables 'elig' and 'status' with numeric (value-labelled)
* versions of the same name. The numeric copies are created first, then the
* string originals are dropped and the copies take over their names.
encode elig, generate(eligg)
encode status, generate(statuss)

drop elig
drop status

rename eligg elig
rename statuss status

* Code books for the two encoded variables
notes elig : Domestic=1; Exchange=2; IFP=3
notes status : Migrant=1; NZBorn=2; Refugee=3

* Sample restriction 1: only leavers whose leaving reason is 'end of schooling' (code "L") remain.
drop if reason!="L"


tab reason

* Sample restriction 2: only domestic students remain. (elig: Domestic=1; Exchange=2; IFP=3)
drop if elig!=1

tab elig
tab status

* Sample restriction 3: remove Migrants (status 1) and Refugees (status 3)
drop if inlist(status, 1, 3)

* Note: the three restrictions above (domestic, migrants, refugees) are changed for the falsification checks. Details to follow

* Collapse 'attainment' into the 4-level variable 'ncea':
*   0 = NCEA Level 1 not achieved
*   1 = at least NCEA Level 1 achieved
*   2 = at least NCEA Level 2 achieved
*   3 = at least NCEA Level 3 achieved
* Attainment codes that appear in none of the lists leave ncea missing.
* NOTE(review): code 4 is mapped to level 2 while code 40 is mapped to level 3 - confirm that 4 is the intended code.
tab attainment
generate ncea = .
replace ncea = 0 if inlist(attainment, 0, 10, 20, 51, 52, 53)
replace ncea = 1 if inlist(attainment, 13, 14, 15, 16, 17, 30, 60, 70, 80, 90)
replace ncea = 2 if inlist(attainment, 24, 25, 26, 27, 4, 61, 71, 81, 91)
replace ncea = 3 if inlist(attainment, 33, 34, 35, 36, 37, 40, 43, 62, 72, 82, 92)
tab ncea

* Inspect repeated 'snz_uid' values. The id has to identify observations uniquely
* before the file is merged with 'moe_clean.student_per'.
sort snz_uid
duplicates list snz_uid

* Step 1: within each snz_uid, retain only the row(s) holding the highest ncea value.
bysort snz_uid: egen ncea_top = max(ncea)
tab ncea_top ncea
drop if ncea != ncea_top
drop ncea_top

sort snz_uid
duplicates list snz_uid

* Step 2: within each snz_uid, retain only the row(s) holding the highest sex code.
bysort snz_uid: egen sex_top = max(sex)
duplicates list snz_uid if sex==sex_top
duplicates list snz_uid if sex!=sex_top
tab sex_top sex

keep if sex==sex_top
drop sex_top

* Step 3: within each snz_uid, retain only the row(s) holding the latest leaveryear.
bysort snz_uid: egen year_top = max(leaveryear)
tab year_top leaveryear
drop if year_top != leaveryear
drop year_top

sort snz_uid
duplicates list snz_uid

* Step 4: within each snz_uid, retain only the row(s) holding the highest age.
bysort snz_uid: egen age_top = max(age)
tab age_top age
drop if age_top != age
drop age_top

sort snz_uid
duplicates list snz_uid

* Removing any remaining duplicates so that snz_uid uniquely identifies rows.
* 'dup' is 0 for ids that occur once and 1, 2, ... for repeated ids; it is
* tabulated as a diagnostic only. The last row of every id is kept, which for a
* pair is the same as dropping the first occurrence and keeping the second.
* Dropping only the first occurrence would leave ids with three or more rows
* duplicated, and the 1:m merge that follows needs snz_uid to be unique.
sort snz_uid
quietly by snz_uid: gen dup = cond(_N==1,0,_n)
tab dup
by snz_uid: keep if _n==_N
drop dup

* Stop with an error here if snz_uid is still not unique.
isid snz_uid

* Merging with 'moe_clean.student_per' to add additional required variables.
* _merge is tabulated BEFORE filtering so the log shows how many observations
* were unmatched on each side; only matched observations (_merge==3) are kept.
merge 1:m snz_uid using "XXX\moe_clean.student_per.dta"
tab _merge
keep if _merge==3
drop _merge

* Give the variables brought in from student_per short working names; the
* original column name is stored as the variable label.
rename moe_spi_birth_month_nbr dobm
label variable dobm "moe_spi_birth_month_nbr"
rename moe_spi_birth_year_nbr doby
label variable doby "moe_spi_birth_year_nbr"

* iwi1-iwi3 and eth1-eth3
forvalues k = 1/3 {
    rename moe_spi_iwi`k'_text iwi`k'
    label variable iwi`k' "moe_spi_iwi`k'_text"
    rename moe_spi_eth`k'_text eth`k'
    label variable eth`k' "moe_spi_eth`k'_text"
}

rename moe_spi_region_code region
label variable region "moe_spi_region_code"
rename moe_spi_ta_code ta
label variable ta "moe_spi_ta_code"

* ethgrp1-ethgrp6
forvalues k = 1/6 {
    rename moe_spi_ethnic_grp`k'_snz_ind ethgrp`k'
    label variable ethgrp`k' "moe_spi_ethnic_grp`k'_snz_ind"
}

rename moe_spi_sex_snz_code sex2
label variable sex2 "moe_spi_sex_snz_code"

* Tabulate four columns once for the log, then remove them from the data.
foreach v in moe_spi_ethnic1_snz_code moe_spi_ethnic2_snz_code moe_spi_ethnic3_snz_code moe_spi_sex_text {
    tab `v'
}

drop moe_spi_ethnic1_snz_code moe_spi_ethnic2_snz_code moe_spi_ethnic3_snz_code moe_spi_sex_text

sort snz_uid
duplicates list snz_uid

* Compare sex as recorded in the two files (sex: student_leavers, sex2: student_per).
* Observations where the two disagree are listed and then removed.
list sex if sex!=sex2
keep if sex==sex2

* Turn the string 'moe_spi_mod_address_date' into the numeric variable 'address_date'.
* NOTE(review): encode numbers the distinct strings in their sort order, so the max() below
* selects the latest date only if the text sorts chronologically (e.g. YYYY-MM-DD) - confirm the stored format.
encode moe_spi_mod_address_date, generate(address_date)

* Within each snz_uid, retain only the row(s) holding the highest address_date code.
bysort snz_uid: egen address=max(address_date)
drop if address_date != address

sort snz_uid
duplicates list snz_uid

tab moe_sl_snz_unique_nbr
tab moe_spi_snz_unique_nbr

* Removing all remaining duplicates of snz_uid in a single pass.
* 'dup' is 0 for ids that occur once and 1, 2, ... for repeated ids; it is
* tabulated as a diagnostic only. The last row of every id is kept. This gives
* the same row as repeatedly dropping the first occurrence, but it also handles
* ids with any number of repeats, where a fixed number of "drop the first
* occurrence" passes would leave duplicates behind.
sort snz_uid
quietly by snz_uid: gen dup = cond(_N==1,0,_n)
tab dup
by snz_uid: keep if _n==_N
drop dup

* Stop with an error here if snz_uid is still not unique.
isid snz_uid

sort dobm doby

* At this point external data with a generated random date of birth has to be input/merged (or the variables generated in Stata):
* dobr (random date of birth), penrollr (randomly generated maximum length of schooling), agem (age in months), agey (age in years) ... e.g., as in 'NZ Penroll.xlsx'

* Missing ncea is recoded to 99
replace ncea=99 if ncea==.

* Cumulative ncea indicators for the ordered logit model; all four are missing where ncea is 99.
*   ncea0 = ncea level 1 not achieved
*   ncea1 = at least ncea level 1 achieved
*   ncea2 = at least ncea level 2 achieved
*   ncea3 = at least ncea level 3 achieved
generate ncea0 = (ncea==0) if ncea!=99
generate ncea1 = inlist(ncea, 1, 2, 3) if ncea!=99
generate ncea2 = inlist(ncea, 2, 3) if ncea!=99
generate ncea3 = (ncea==3) if ncea!=99

foreach v in ncea ncea0 ncea1 ncea2 ncea3 {
    tab `v'
}

sort schoolno

* School deciles are isolated from the data file 'moe_clean.enrolment'; see the do file 'School Decile' for how that file was created.
* Here the master file is merged with 'School Decile' to attach a decile to every schoolno.

merge m:1 schoolno using "XXX\School Decile.dta"
drop if _merge==2

* Deciles that are still missing are filled in from an external Ministry of Education source (file name: Student rolls by School 2009-2017, available online), using the values for year 2015.
tab schoolno if _merge==1
tab schoolno if school_decile==.
replace school_decile = 2 if inlist(schoolno, 558, 1076, 1581, 2072)
replace school_decile = 3 if schoolno==1095
replace school_decile = 4 if inlist(schoolno, 1383, 1877)
replace school_decile = 7 if inlist(schoolno, 539, 627)
replace school_decile = 8 if schoolno==4035
replace school_decile = 10 if schoolno==2865
* These schools are explicitly set to missing
replace school_decile = . if inlist(schoolno, 701, 713, 742, 750, 4929, 4930, 5570)

* Missing decile is recoded to 99
replace school_decile=99 if school_decile==.

* Dropping unusual/impossible doby (birth year).
* doby is tabulated before and after the drop so the log shows which years were removed.
tab doby
drop if inlist(doby, 1954, 1960, 1963, 1966, 1969, 1972, 1980, 1981, 1982, 1970, 1984, 1986, 1987, 2003, 2010, 2011)
tab doby

* Only the primary ethnicity / iwi is used; the second and third entries are removed.
drop eth2 eth3
drop iwi2 iwi3

rename eth1 ethnicity

* 'eth' is a copy of 'ethnicity' regrouped into broader categories (code book in the note below)
generate eth = ethnicity
notes eth : New Zealand European=111; Maori=211; Australian=128; European=129; Pacific People=371; Asian=444; Other Ethnicity (including Middle Eastern, Latin American, and African)=611; Not Stated=999
tab eth
* Observations with codes 100 and 300 are removed
drop if inlist(eth, 100, 300)
* 121-127 -> 129
replace eth = 129 if inlist(eth, 121, 122, 123, 124, 125, 126, 127)
* 311-361 -> 371
replace eth = 371 if inlist(eth, 311, 321, 331, 341, 351, 361)
* 411-443 -> 444
replace eth = 444 if inlist(eth, 411, 412, 413, 414, 421, 431, 441, 442, 443)
* 511-531 -> 611
replace eth = 611 if inlist(eth, 511, 521, 531)
* 944 -> 999
replace eth = 999 if eth==944
tab eth

* Code book for school_region; the separate 'region' variable is removed.
notes school_region: Northland Region=1; Auckland Region=2; Waikato Region=3; Bay of Plenty Region=4; Gisborne Region=5; Hawkes Bay Region=6; Taranaki Region=7; Manawatu-Whanganui Region=8; Wellington Region=9; West Coast Region=12; Canterbury Region=13; Otago Region=14; Southland Region=15; Tasman Region=16; Nelson Region=17; Marlborough Region=18
drop region

* 'reg' is a copy of school_region collapsed into four groups (code book in the note below).
* Code 1 is already the target value of the first group, so it is not listed in the first replace.
generate reg = school_region
replace reg = 1 if inlist(reg, 2, 3, 4, 5)
replace reg = 2 if inlist(reg, 6, 7, 8, 9)
replace reg = 3 if inlist(reg, 12, 13, 16, 17, 18)
replace reg = 4 if inlist(reg, 14, 15)
notes reg: Northland, Auckland, Waikato, Bay of Plenty, Gisborne=1; Hawkes Bay Region, Taranaki, Manawatu-Whanganui, Wellington=2; Canterbury, West Coast, Tasman, Nelson, Marlborough=3; Otago, Southland=4
tab reg

* Missing school_region is recoded to 99 (done after 'reg' is built, so 'reg' stays missing for those rows)
replace school_region=99 if school_region==.

* Squared age term, scaled by 1/100
generate agem2 = agem^2/100
label variable agem2 "Age^2/100"

* Decile subgroup flags. Each flag is 1 inside its group and missing elsewhere.
*   dec1 = School Deciles 1, 2, 3 & 4
*   dec2 = School Deciles 5, 6 & 7
*   dec3 = School Deciles 8, 9 & 10
generate dec1 = 1 if inlist(school_decile, 1, 2, 3, 4)
generate dec2 = 1 if inlist(school_decile, 5, 6, 7)
generate dec3 = 1 if inlist(school_decile, 8, 9, 10)
foreach flag in dec1 dec2 dec3 {
    tab `flag' school_decile
}

* Outcome for the ordered logit / ordered probit models: a copy of ncea in which
* the code 99 is set back to missing. Tabulated before and after the recode.
tab ncea
generate ncea_ologit = ncea
tab ncea_ologit
replace ncea_ologit = . if ncea==99
tab ncea_ologit

* The data are stored as 'Working File'
save "XXX\Working File.dta"


****************************************************************************************************************************************************************************
* REGRESSIONS
******************************************************************************************
* All models: standard errors clustered by school; school fixed effects; year x region interactions
* The right-hand-side terms shared by every model are held in one local macro.
local controls agem agem2 i.sex i.eth i.schoolno i.doby i.doby#school_region

* Exogeneity check (OLS): penrollr regressed on the shared terms
regress penrollr `controls', robust cluster(schoolno)

* Outcome regressions (OLS): each outcome regressed on penrollr plus the shared terms
foreach outcome in ncea1 ncea2 ncea3 ue {
    regress `outcome' penrollr `controls', robust cluster(schoolno)
}

****************************************************************************************************************************************************************************
* SUBSAMPLE REGRESSIONS
****************************************************************************************************************************************************************************

* CLASSIFICATION BY GENDER
** female: if sex==2 (the models use i.sex, so no xi-generated _Isex_2 variable exists; sex: Male=1; Female=2)
** male: if sex==1
* CLASSIFICATION BY ETHNICITY
** New Zealand European: if eth==111
** Maori: if eth==211
** Asian: if eth==444
* CLASSIFICATION BY SCHOOL DECILE
** dec1=School Deciles 1, 2, 3 & 4: if dec1==1
** dec2=School Deciles 5, 6 & 7: if dec2==1
** dec3=School Deciles 8, 9 & 10: if dec3==1


** For placebo1
*** keep if status==1 | status==3
*** drop if elig==2

** For placebo2
*** keep if elig==3

*************************************************
* ORDERED LOGIT AND ORDERED PROBIT
* Each model is estimated once; marginal effects on the probability of every
* ncea_ologit outcome (0, 1, 2, 3) are then computed from that one set of estimates.
* margins is used instead of mfx because the models contain factor variables
* (i.sex, i.eth, i.doby), which mfx does not support. 'atmeans' evaluates the
* effects at the means of the covariates, which is what mfx does by default.

foreach model in ologit oprobit {
    `model' ncea_ologit penrollr agem  agem2 i.sex i.eth i.doby, robust cluster(schoolno)
    forvalues level = 0/3 {
        margins, dydx(*) atmeans predict(outcome(`level'))
    }
}


